suppressPackageStartupMessages({
library("EnhancedVolcano")
library("limma")
library("biomaRt")
library("annotables")
library("tximport")
library("apeglm")
library("eulerr")
library("DESeq2")
library("HGNChelper")
library("tictoc")
library("DESeq2")
library("kableExtra")
library("beeswarm")
library("missMethyl")
library("gridExtra")
library("png")
library("metafor")
library("ggplot2")
library("purrr")
library("metafor")
  library("AnnotationDbi")

library("readxl")
library("ggplot2")
library("tidyverse")
library("magrittr")
library("readr")
library("eulerr")
library("RColorBrewer")
library("e1071")
library("plotly") 
library("pheatmap")
library("msigdbr")
library("mitch")
  library("rtracklayer")
  library("dplyr")
  library("org.Mm.eg.db")
  library("fgsea")
  
})

CORES=16

Read the sample manifest and alignment summary

I ran two types of aligment with kallisto (with and without ncRNA), I have comapred the result of the alingement and choosed the cDNA+ncRNA as the aligment since the aligment percentage is higher in this combo

base<- "/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/"
idx<- "/mnt/vol1/Mouse_model_RNA_Seq/index"

samples<- read_tsv(file.path(base,"samples.txt"),col_names = "sample", show_col_types = FALSE)
samples$path <- file.path(base, samples$sample, "abundance.tsv")

align<- read_delim("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/alignment_summary.tsv")

old_align<-read_csv("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results/aligment_no_ncrna.csv", show_col_types = FALSE)

new_df<- align%>% mutate(index = "cDNA + ncRNA", rate = 100 * pseudoaligned/processed) %>%
 select(sample, index, rate)
old_df <- old_align %>% mutate(index = "cDNA only",                               rate = 100 * pseudoaligned/processed) %>%
  select(sample, index, rate)

df<- bind_rows(new_df, old_df)


delta<- df %>% pivot_wider(names_from = index, values_from = rate) %>%
  mutate(delta = `cDNA + ncRNA` - `cDNA only`) %>%
  arrange(delta)
df$sample <- factor(df$sample, levels = delta$sample)

ggplot(df, aes(sample, rate, fill = index)) +
  geom_col(position = position_dodge(width = 0.75), width = 0.7) +
  labs(title = "Pseudoalignment rate by sample",
       x = "Sample", y = "Rate (%)", fill = "Run") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Import transcripts to gene level via trimport package

tx2gene<- read.csv(file.path("/mnt/vol1/Mouse_model_RNA_Seq/tx2gene_r115.csv"), stringsAsFactors = FALSE)
stopifnot(all(c("TXNAME","GENEID") %in% names(tx2gene)))

files<- file.path(base, samples$sample, "abundance.tsv")
names(files) <- samples$sample
stopifnot(all(file.exists(files)))

txi<- tximport(files,
                type = "kallisto",
                tx2gene = tx2gene[, c("TXNAME","GENEID")],
                countsFromAbundance = "lengthScaledTPM",
                ignoreTxVersion = TRUE)

txi$counts[1:2, 1:6] 
##                       BB329    BB330    BB331    BB332    BB334    BB335
## ENSMUSG00000000001 3423.529 4452.743 3423.237 3360.252 4088.225 2816.038
## ENSMUSG00000000003    0.000    0.000    0.000    0.000    0.000    0.000

Reading meta data that comes with the samples

pheno<- read_excel("/mnt/vol1/Mouse_model_RNA_Seq/minipump.xlsx",skip =1 )
colnames(pheno)
## [1] "Sample"                     "Sex"                       
## [3] "Intervention"               "RNA concentration\r\nng/ul"
## [5] "RIN"                        "Well Number"
coldata<- pheno %>%
  filter(Sample %in% samples$sample) %>%
  arrange(match(Sample, samples$sample)) %>%
  mutate(
    Sex = factor(Sex, levels = c("F","M")), # F as baseline
    Treatment = factor(Intervention, levels = c("Saline","Ang-II"))) # saline as baseline

rownames(coldata)<- coldata$Sample

Making DESeqDataSet from tximport

dds<- DESeqDataSetFromTximport(txi, colData = coldata, design = ~ Sex + Treatment + Sex:Treatment)

Filtering for low count and normalisation of the counts

keep<- rowSums(counts(dds)) >= 10
dds<- dds[keep, ]

dds<- estimateSizeFactors(dds)
norm_counts<- counts(dds, normalized = TRUE)
glimpse(norm_counts)
##  num [1:41594, 1:17] 3210.9 157.5 1444.1 196.9 10.3 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:41594] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
##   ..$ : chr [1:17] "BB329" "BB330" "BB331" "BB332" ...

Unsupervised clustering analyses

vsd<- vst(dds, blind = TRUE)

ann<- as.data.frame(colData(dds))[, c("Sex","Treatment"), drop = FALSE]

#sample–sample correlation heatmap
pheatmap(cor(assay(vsd)),
         annotation_col = ann,
         annotation_row = ann) 

# PCA (colour = Treatment, shape = sex)
plotPCA(vsd, intgroup = c("Treatment","Sex"))

## Fit model (set baselines)
dds$Sex<- relevel(factor(dds$Sex), "F")
dds$Treatment<- relevel(factor(dds$Treatment), "Saline")
design(dds)<- ~ Sex + Treatment + Sex:Treatment
dds<- DESeq(dds)
resultsNames(dds)
## [1] "Intercept"                  "Sex_M_vs_F"                
## [3] "Treatment_Ang.II_vs_Saline" "SexM.TreatmentAng.II"
# baseline (saline): M vs F
coef_sex<- "Sex_M_vs_F"   

# Ang-II vs Saline in females (F is baseline sex)
coef_trt<- "Treatment_Ang.II_vs_Saline"  

# (Ang-II effect in M) & (Ang-II effect in F)
coef_int<- "SexM.TreatmentAng.II"        

RAW results vs Shrinking the log2 fold changes

With LFC filtering

### RAW results ###

# 1. Within each Sex
# 1.1 Female: Ang-II vs Saline  
res_F_raw<- results(dds, name = coef_trt)
# 1.2 Male: Ang-II vs Saline  
res_M_raw<- results(dds, contrast = list(c(coef_trt, coef_int)))

# 2. Effect of Ang-II treatment between sex
res_int_raw<- results(dds, name = coef_int)

# 3. Baseline (saline): M vs F
res_bsl_raw<- results(dds, name = coef_sex)

# Shrinkage results
res_F_shr<- lfcShrink(dds, coef = coef_trt, type = "ashr")
res_M_shr<- lfcShrink(dds, contrast= list(c(coef_trt, coef_int)), type = "ashr")
res_int_shr<- lfcShrink(dds, coef = coef_int, type = "ashr")
res_bsl_shr<- lfcShrink(dds, coef = coef_sex, type = "ashr")

make_df<- function(raw, shr, label){
  data.frame(
    gene = rownames(raw),
    lfc_raw= raw$log2FoldChange,
    lfc_shr = shr$log2FoldChange,
    FDR_raw = raw$padj,           
    contrast = label,
    stringsAsFactors = FALSE)
}

res_all<- bind_rows(
  make_df(res_F_raw, res_F_shr, "Female: Ang-II vs Saline"),
  make_df(res_M_raw, res_M_shr, "Male: Ang-II vs Saline"),
  make_df(res_int_raw, res_int_shr,"Interaction: (M & F) treatment effect"),
  make_df(res_bsl_raw, res_bsl_shr, "Baseline (Saline): M vs F")
) 

# Raw vs Shrunken
res_long<- res_all %>%
  mutate(
    hit_raw= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_raw)>= 0.5,
    hit_shr= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_shr)>= 0.5
  ) %>%
  dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, hit_raw, hit_shr) %>%
  tidyr::pivot_longer(
    cols = c(lfc_raw, lfc_shr, hit_raw, hit_shr),
    names_to= c(".value", "lfc_type"),
    names_sep= "_"              
  ) %>%
  mutate(
    lfc_type = dplyr::recode(lfc_type,
      raw = "Raw LFC",
      shr = "Shrunken LFC"),
    lfc_type = factor(lfc_type, levels = c("Raw LFC", "Shrunken LFC")),
    contrast = factor(
      contrast,
      levels = c(
        "Female: Ang-II vs Saline",
        "Male: Ang-II vs Saline",
        "Interaction: (M & F) treatment effect",
        "Baseline (Saline): M vs F"
      )
    )
  )


x_max<- quantile(abs(res_long$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))

ggplot(res_long, aes(lfc, -log10(FDR_raw), colour = hit)) +
  geom_point(alpha = 0.6, size = 1.2) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = 2) +
  geom_hline(yintercept = -log10(0.05), linetype = 2) +
  coord_cartesian(xlim = x_lim, ylim = y_lim) +
  facet_grid(contrast ~ lfc_type, switch = "y") +
  scale_colour_manual(values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
                      labels = c("FALSE" = "FDR>0.05 or LFC<0.5", "TRUE" = "FDR<=0.05 &  LFC>=0.5"),name = NULL) +
  labs(
    title = "Volcano comparison: Raw vs Shrunken (ashr) with LFC filtering",
    x = "log2 fold change",
    y = expression(-log[10]("FDR from unshrunken"))) +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.placement = "outside",
    strip.background = element_blank(),
    strip.text.x = element_text(face = "bold"),
    strip.text.y.left = element_text(angle = 0, face = "bold"),
    panel.grid.minor = element_blank())

## Without LFC filtering

# Raw vs Shrunken
res_long_nolfc<- res_all %>%
  mutate(
    sig_raw = !is.na(FDR_raw) & FDR_raw <= 0.05,
    sig_shr = !is.na(FDR_raw) & FDR_raw <= 0.05
  ) %>%
  dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, sig_raw, sig_shr) %>%
  tidyr::pivot_longer(
    cols= c(lfc_raw, lfc_shr, sig_raw, sig_shr),
    names_to= c(".value", "lfc_type"),
    names_sep= "_") %>%
  mutate(
    lfc_type = recode(lfc_type,
      raw = "Raw LFC (before)",
      shr = "Shrunken LFC (after)"),
    lfc_type = factor(lfc_type, levels = c("Raw LFC (before)", "Shrunken LFC (after)")),
    contrast= factor(contrast, levels = c(
      "Female: Ang-II vs Saline",
      "Male: Ang-II vs Saline",
      "Interaction: (M & F) treatment effect",
      "Baseline (Saline): M vs F"
    )))

x_max<- quantile(abs(res_long_nolfc$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long_nolfc$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))

ggplot(res_long_nolfc, aes(lfc, -log10(FDR_raw), colour = sig)) +
  geom_point(alpha = 0.6, size = 1.2) +
  geom_hline(yintercept = -log10(0.05), linetype = 2) +
  coord_cartesian(xlim = x_lim, ylim = y_lim) +
  facet_grid(contrast ~ lfc_type, switch = "y") +
  scale_colour_manual(
    values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
    labels = c("FALSE" = "FDR > 0.05", "TRUE" = "FDR ≤ 0.05"),
    name = NULL) +
  labs(
    title = "Volcano comparison: Raw vs Shrunken (ashr) without LFC filtering",
    x = "log2 fold change",
    y = expression(-log[10]("FDR (from unshrunken fit)"))) +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.placement = "outside",
    strip.background = element_blank(),
    strip.text.x = element_text(face = "bold"),
    strip.text.y.left = element_text(angle = 0, face = "bold"),
    panel.grid.minor = element_blank())

Diagnostics

The interaction coefficient (SexM.TreatmentAng.II) is a difference-in-differences. If β_int ~ 0 for most genes, it means the Ang-II effect is about the same in males and females. Therefore, it could be assume that there can be still many DE genes within each sex but with small effects, however, the extra difference between males and females could almost be negligible for Ang-II.

Reduced model The LRT is comparing the full model to the reduced model to identify significant genes. The p-values are determined solely by the difference in deviance between the ‘full’ and ‘reduced’ model formula (not log2 fold changes). Essentially the LRT test is testing whether the term(s) removed in the ‘reduced’ model explains a significant amount of variation in the data. Quoted https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html

Context:

So in the case, where females are baseline/ reference; * β_int > 0: Ang-II effect is larger in males than females.

  • β_int < 0: Ang-II effect is smaller in males (or larger in females).

Results of the preliminary states that;

  • Group sizes: F (4 Ang-II, 3 Saline), M (4 Ang-II, 6 Saline) = unbalanced and small

  • F vs M LFC scatter: global Pearson r ≈ 0.056 is small but could be because ~95% of genes are near 0, hence noise dominates the correlation.

  • Summary of the full model (with interaction) to the reduced model (without interaction) gene-by-gene, shows ~0–4 genes at FDR <0.1 (and 0–1 at FDR < 0.05) out of 41594 genes. Hence it could be deduced that the interaction term adds almost no explanatory power overall.

  • Effect-size difference (M & F) distribution of median −0.023, centred near 0, further proving that Ang-II effect size is similar in males and females overall.

  • The histogram of LFC shows that the values of the res_F_raw and res_M_raw is centred around zero(median −0.023), indicating that Ang-II effects are broadly similar between sexes, consistent with the likelihood-ratio test showing negligible evidence for an interaction

# No.of saline and treatment samples of each sex
table(pheno$Sex, pheno$Intervention)
##    
##     Ang-II Saline
##   F      4      3
##   M      4      6
# correlation of the interaction 
summary(res_F_raw$lfcSE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.05636 0.17710 0.40386 0.67548 0.90210 4.80267
summary(res_M_raw$lfcSE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04774 0.15084 0.35101 0.58459 0.78546 4.05917
cor.test(res_F_raw$log2FoldChange, res_M_raw$log2FoldChange, use="complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  res_F_raw$log2FoldChange and res_M_raw$log2FoldChange
## t = 11.464, df = 41592, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04653857 0.06569855
## sample estimates:
##        cor 
## 0.05612373
lfc_F<- res_F_raw$log2FoldChange         
lfc_M<- res_M_raw$log2FoldChange       
delta<- lfc_M - lfc_F   
summary(delta)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -30.00000  -0.32066  -0.02349  -0.03099   0.33085  21.20516
# Model without interaction term (Hypothesis testing: Likelihood ratio test (LRT))
dds_lrt<- DESeq(dds, test = "LRT", reduced = ~ Sex + Treatment)
lrt<- results(dds_lrt)   
summary(lrt)
## 
## out of 41594 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3, 0.0072%
## LFC < 0 (down)     : 1, 0.0024%
## outliers [1]       : 21, 0.05%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lrt05<- results(dds_lrt, alpha = 0.05)
summary(lrt05) 
## 
## out of 41594 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 1, 0.0024%
## outliers [1]       : 21, 0.05%
## low counts [2]     : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of M & F interaction LFC
hist(res_int_raw$log2FoldChange, breaks=80)

Heatmaps of top 40 genes per contrast

Interaction heatmap shows the “top 40 by smallest FDR”

top_heatmap<- function(res_shr, vsd, label, n=40) {
  keep<- order(res_shr$padj, na.last = NA)[1:min(n, sum(!is.na(res_shr$padj)))]
  genes<- rownames(res_shr)[keep]
  mat<- assay(vsd)[genes, , drop=FALSE]
  mat<- scale(t(scale(t(mat))))  
  ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
  pheatmap(mat, annotation_col = ann, show_rownames = TRUE,
           main = paste0("Top ", n, " DE genes: ", label))
}

top_heatmap(res_F_shr, vsd, "F Ang-II vs Saline")

top_heatmap(res_M_shr, vsd, "M Ang-II vs Saline")

top_heatmap(res_int_shr, vsd, "Interaction (M & F)")

top_heatmap(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")

# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")

# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")

top_heatmap_sig<- function(res_shr, vsd, label, n=40, alpha=0.05){
  sig<- which(!is.na(res_shr$padj) & res_shr$padj <= alpha)
  if (length(sig) == 0) {
    message("No genes pass FDR <= ", alpha, " for: ", label)
    return(invisible(NULL))
  }
  keep<- head(sig[order(res_shr$padj[sig])], n)
  genes<- rownames(res_shr)[keep]
  mat<- assay(vsd)[genes, , drop=FALSE]
  mat<- scale(t(scale(t(mat))))
  ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
  pheatmap(mat, annotation_col=ann, show_rownames=TRUE,
           main=sprintf("Top %d genes (FDR <= %.2g): %s", length(genes), alpha, label))
}

top_heatmap_sig(res_F_shr, vsd, "F Ang-II vs Saline")

top_heatmap_sig(res_M_shr, vsd, "M Ang-II vs Saline")

top_heatmap_sig(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")

sum(res_int_shr$padj <= 0.05, na.rm=TRUE)
## [1] 1
# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap_sig(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")

# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap_sig(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")

Pathway enrichment Analysis

GSEA (Hallmarks)

MSigDB “Hallmarks” (collection H) = 50 core biological with minimal overlap gene sets that capture broad, well-defined biological processes.

# Map Ensembl to gene symbols 
gtf<- file.path(idx, "Mus_musculus.GRCm39.115.gtf")
gtf_df<- as.data.frame(rtracklayer::import(gtf))

# For symbols
ann<- gtf_df %>%
  filter(type == "gene") %>%
  transmute(
    ensgene = sub("\\.\\d+$", "", gene_id),
    symbol  = gene_name) %>%
  distinct()

head(ann)
##              ensgene  symbol
## 1 ENSMUSG00000104478 Gm38212
## 2 ENSMUSG00000104385  Gm7449
## 3 ENSMUSG00000101231 Gm28283
## 4 ENSMUSG00000101097  Gm6679
## 5 ENSMUSG00000100764 Gm29155
## 6 ENSMUSG00000100831 Gm17847
# map ensembl to entrez
map_ensg_to_entrez<- function(ens_ids){
  ens_ids <- sub("\\.\\d+$","", ens_ids)
  tibble(ENSEMBL = ens_ids) %>%
    mutate(ENTREZID = AnnotationDbi::mapIds(
      org.Mm.eg.db, keys = ENSEMBL, keytype = "ENSEMBL",
      column = "ENTREZID", multiVals = "first")) %>%
    filter(!is.na(ENTREZID)) %>% distinct()
}

# rank vectors on entrez 
rankify_entrez<- function(res_raw, ann_map){
  df<- as.data.frame(res_raw) %>%
    rownames_to_column("ensgene") %>%
    mutate(ensgene = sub("\\.\\d+$","", ensgene)) %>%
    left_join(ann_map, by = c("ensgene"="ENSEMBL")) %>%
    filter(!is.na(stat), !is.na(ENTREZID))
  split(df$stat, df$ENTREZID) %>% vapply(\(v) v[which.max(abs(v))], numeric(1))
}

ens_all<- unique(c(rownames(res_F_raw), rownames(res_M_raw),
                       rownames(res_int_raw), rownames(res_bsl_raw)))
ann_entrez<- map_ensg_to_entrez(ens_all)

ranks_F_e<- rankify_entrez(res_F_raw, ann_entrez)
ranks_M_e<- rankify_entrez(res_M_raw, ann_entrez)
ranks_Int_e<- rankify_entrez(res_int_raw, ann_entrez)
ranks_Bas_e<- rankify_entrez(res_bsl_raw, ann_entrez)

# Hallmark pathways grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="H") %>%
  distinct(gs_name, entrez_gene) %>%
  group_by(gs_name) %>%
  summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
  tibble::deframe()

# overlap 
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.00   95.75  164.50  142.28  195.75  202.00
# GSEA
run_gsea<- function(ranks, label, pathways) {
  fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}

gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)", msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)", msig_entrez)

gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)

length(ranks_F_e)
## [1] 22219
sum(duplicated(names(ranks_F_e)))
## [1] 0
summary(ranks_F_e)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5.4778 -0.5577  0.2257  0.2475  1.0306  6.2055

Context:

4 contrasts And their interpretation

  1. Baseline (M vs F) This is Sex_M_vs_F, so its Male − Female Red (NES > 0): pathway genes are more highly expressed in males than females. Blue (NES < 0): pathway genes are lower in males, so higher in females.

  2. F Ang-II vs Saline This is Treatment_Ang.II_vs_Saline with F as baseline sex = (Ang-II females) − (Saline
    females) Red: pathway enriched in Ang-II females (treatment turned it up). Blue: pathway lower in Ang-II females,so relatively higher in Saline females.

3.M Ang-II vs Saline This IS the main + interaction = (Ang-II males) − (Saline males) Red: pathway up in Ang-II males. Blue: pathway down in Ang-II males = higher in Saline males.

  1. Interaction (M & F) This is SexM.TreatmentAng.II, which answers the question, “does Ang-II change this pathway
    more in males than females?” (main effect of treatment is reference to females) Red: Ang-II effect is stronger in males than in females. Blue: Ang-II effect is stronger in females than in males
alpha<- 0.05

top_k<- gsea_all %>%
  dplyr::filter(!is.na(padj), padj <= alpha) %>%              
  dplyr::group_by(contrast) %>%
  dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>% 
  dplyr::slice_head(n = 10) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    path = gsub("^HALLMARK_", "", pathway),
    path = gsub("_", " ", path),
    id   = paste(contrast, path, sep = "|")                   
  ) %>%
  dplyr::group_by(contrast) %>%
  dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>% 
  dplyr::ungroup()

wrap_lab<- function(x, width = 32) {
  vapply(x, function(s) paste(strwrap(s, width = width), collapse = "\n"), character(1))
}


ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
  geom_col() +coord_flip() +
  facet_wrap(~ contrast, scales = "free_y") +
  scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
  scale_fill_manual(
    values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
    name   = "Direction",
    labels = c("FALSE" = "Enriched in genes lower in this contrast",
               "TRUE"  = "Enriched in genes higher in this contrast")) +
  labs(
    title = sprintf("Top 10 Hallmark pathways per contrast (FDR <=0.05)", alpha),
    x = NULL, y = "Normalised Enrichment Score (NES)") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "bottom")

# GO: BP pathways (msigdbr), grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="C5", subcategory = "GO:BP")%>%
  distinct(gs_name, entrez_gene) %>%
  group_by(gs_name) %>%
  summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
  tibble::deframe()

# overlap 
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     8.0    18.0    73.3    54.0  1825.0
# GSEA
run_gsea<- function(ranks, label, pathways) {
  fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}

gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)",  msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)",  msig_entrez)

gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)

length(ranks_F_e)
## [1] 22219
sum(duplicated(names(ranks_F_e)))
## [1] 0
summary(ranks_F_e)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5.4778 -0.5577  0.2257  0.2475  1.0306  6.2055
top_k<- gsea_all %>%
  dplyr::filter(!is.na(padj), padj <= alpha) %>%              
  dplyr::group_by(contrast) %>%
  dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>%  
  dplyr::slice_head(n = 10) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    path_label = gsub("^GOBP_", "", pathway),
    path_label = gsub("_", " ", path_label),
    id = paste(contrast, path_label, sep = "|")) %>%
  dplyr::group_by(contrast) %>%
  dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>%
  dplyr::ungroup()

ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ contrast, scales = "free_y") +
  scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
  scale_fill_manual(
    values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
    name   = "Direction",
    labels = c("FALSE" = "Enriched among genes lower in this contrast",
               "TRUE"  = "Enriched among genes higher in this contrast")) +
  labs(title = sprintf("Top 10 GO:BP pathways per contrast (FDR <=0.05)", alpha),
    subtitle = NULL,
    x = NULL, y = "Normalised Enrichment Score (NES)") +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold"))

Mitch

deseq_to_prescore <- function(res_obj) {
  as.data.frame(res_obj) %>%
    rownames_to_column("ensgene") %>%
    mutate(ensgene = sub("\\.\\d+$", "", ensgene)) %>%
    filter(!is.na(stat)) %>%
    dplyr::select(ensgene, score = stat) %>%
    group_by(ensgene) %>%
    slice_max(order_by = abs(score), n = 1) %>%
    ungroup() %>%
    column_to_rownames("ensgene")
}

x_F<- deseq_to_prescore(res_F_raw)
x_M <- deseq_to_prescore(res_M_raw)
x_Int<- deseq_to_prescore(res_int_raw)
x_Bas<- deseq_to_prescore(res_bsl_raw)

geneTable<- ann %>%
  filter(!is.na(symbol), symbol != "") %>%
  dplyr::select(gene = symbol, ensgene)

# Hallmark
msig_hall<- msigdbr(
  species= "Mus musculus",
  category= "H") %>%
  dplyr::select(gs_name, gene_symbol) %>%
  distinct() %>%
  group_by(gs_name) %>%
  summarise(genes = list(gene_symbol), .groups = "drop") %>%
  tibble::deframe()

# GO:BP
msig_gobp<- msigdbr(
  species= "Mus musculus",
  category= "C5",
  subcategory= "GO:BP") %>%
  dplyr::select(gs_name, gene_symbol) %>%
  distinct() %>%
  group_by(gs_name) %>%
  summarise(genes = list(gene_symbol), .groups = "drop") %>%
  tibble::deframe()

# run mitch for one contrast
run_mitch_one<- function(x_mat, genesets, label) {
  mobj <- mitch_import(
    x= x_mat,
    DEtype= "prescored",
    geneTable= geneTable)
  mres<- mitch_calc(
    x = mobj,
    genesets= genesets,
    minsetsize= 5,
    cores= CORES,
    priority= "effect")
  out<- mres$enrichment_result %>%
    mutate(
      contrast= label,
      set_clean= gsub("_", " ", gsub("^HALLMARK_|^GOBP_", "", set)))
  return(out)
}

#4 contrasts (hallmark)
hall_F<- run_mitch_one(x_F, msig_hall, "F Ang-II vs Saline")
hall_M<- run_mitch_one(x_M, msig_hall, "M Ang-II vs Saline")
hall_Int<- run_mitch_one(x_Int, msig_hall, "Interaction (M vs F)")
hall_Bas<- run_mitch_one(x_Bas, msig_hall, "Baseline M vs F")

hall_all<- bind_rows(hall_F, hall_M, hall_Int, hall_Bas)

#sig + top 10 per contrast
hall_top10<- hall_all %>%
  filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
  arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
  group_by(contrast) %>%
  slice_head(n = 10) %>%
  ungroup()

hall_top10
## # A tibble: 40 × 7
##    set                  setSize   pANOVA s.dist p.adjustANOVA contrast set_clean
##    <chr>                  <int>    <dbl>  <dbl>         <dbl> <chr>    <chr>    
##  1 HALLMARK_OXIDATIVE_…     185 2.26e-52  0.647      1.13e-50 Baselin… OXIDATIV…
##  2 HALLMARK_ADIPOGENES…     197 2.49e-36  0.519      6.24e-35 Baselin… ADIPOGEN…
##  3 HALLMARK_HEME_METAB…     193 1.75e-23  0.416      2.42e-22 Baselin… HEME MET…
##  4 HALLMARK_MTORC1_SIG…     197 1.93e-23  0.411      2.42e-22 Baselin… MTORC1 S…
##  5 HALLMARK_GLYCOLYSIS      194 1.63e-22  0.405      1.63e-21 Baselin… GLYCOLYS…
##  6 HALLMARK_FATTY_ACID…     151 6.94e-20  0.429      5.79e-19 Baselin… FATTY AC…
##  7 HALLMARK_P53_PATHWAY     196 1.13e-19  0.375      8.08e-19 Baselin… P53 PATH…
##  8 HALLMARK_MYOGENESIS      197 1.69e-18  0.362      1.06e-17 Baselin… MYOGENES…
##  9 HALLMARK_DNA_REPAIR      149 3.10e-17  0.400      1.72e-16 Baselin… DNA REPA…
## 10 HALLMARK_TNFA_SIGNA…     198 8.51e-17  0.342      4.25e-16 Baselin… TNFA SIG…
## # ℹ 30 more rows
#4 contrasts(GO:BP)
gobp_F<- run_mitch_one(x_F, msig_gobp, "F Ang-II vs Saline")
gobp_M<- run_mitch_one(x_M, msig_gobp, "M Ang-II vs Saline")
gobp_Int<- run_mitch_one(x_Int, msig_gobp, "Interaction (M vs F)")
gobp_Bas<- run_mitch_one(x_Bas, msig_gobp, "Baseline M vs F")

gobp_all<- bind_rows(gobp_F, gobp_M, gobp_Int, gobp_Bas)

gobp_top10<- gobp_all %>%
  filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
  arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
  group_by(contrast) %>%
  slice_head(n = 10) %>%
  ungroup()

gobp_top10
## # A tibble: 40 × 7
##    set                 setSize    pANOVA s.dist p.adjustANOVA contrast set_clean
##    <chr>                 <int>     <dbl>  <dbl>         <dbl> <chr>    <chr>    
##  1 GOBP_PHOSPHORUS_ME…    1769 1.49e-120  0.327     1.09e-116 Baselin… PHOSPHOR…
##  2 GOBP_SMALL_MOLECUL…    1601 6.18e-104  0.318     2.26e-100 Baselin… SMALL MO…
##  3 GOBP_VESICLE_MEDIA…    1496 1.42e-102  0.326     3.46e- 99 Baselin… VESICLE …
##  4 GOBP_ESTABLISHMENT…    1631 8.05e-101  0.310     1.47e- 97 Baselin… ESTABLIS…
##  5 GOBP_PROTEIN_CONTA…    1710 6.46e- 94  0.292     9.44e- 91 Baselin… PROTEIN …
##  6 GOBP_REGULATION_OF…    1433 4.20e- 93  0.317     5.13e- 90 Baselin… REGULATI…
##  7 GOBP_NITROGEN_COMP…    1783 3.51e- 90  0.281     3.67e- 87 Baselin… NITROGEN…
##  8 GOBP_APOPTOTIC_PRO…    1764 2.72e- 88  0.279     2.49e- 85 Baselin… APOPTOTI…
##  9 GOBP_MACROMOLECULE…    1397 3.11e- 80  0.297     2.53e- 77 Baselin… MACROMOL…
## 10 GOBP_POSITIVE_REGU…    1704 4.37e- 80  0.270     3.20e- 77 Baselin… POSITIVE…
## # ℹ 30 more rows
# Hallmark pathways

hall_top10$contrast<- factor(
  hall_top10$contrast,
  levels= c(
    "F Ang-II vs Saline",
    "M Ang-II vs Saline",
    "Interaction (M vs F)",
    "Baseline M vs F"
  )
)

ggplot(hall_top10,
       aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ contrast, scales = "free_y") +
  scale_fill_manual(
    values= c("TRUE" = "firebrick", "FALSE" = "steelblue"),
    name= "Direction",
    labels= c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
  labs(
    title = "mitch: Hallmark pathways (top 10 per contrast)",
    x = NULL,
    y = "mitch s.dist") +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.text = element_text(face = "bold"))

# GO:BP pathways
gobp_top10$contrast<- factor(
  gobp_top10$contrast,
  levels = c(
    "F Ang-II vs Saline",
    "M Ang-II vs Saline",
    "Interaction (M & F)",
    "Baseline M vs F"
  )
)

ggplot(gobp_top10,
       aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ contrast, scales = "free_y") +
  scale_fill_manual(
    values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
    name   = "Direction",
    labels = c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
  labs(
    title = "mitch: GO:BP pathways (top 10 per contrast)",
    x = NULL,
    y = "mitch s.dist") +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "bottom",
    strip.text = element_text(face = "bold"))

sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_AU.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_AU.UTF-8        LC_COLLATE=en_AU.UTF-8    
##  [5] LC_MONETARY=en_AU.UTF-8    LC_MESSAGES=en_AU.UTF-8   
##  [7] LC_PAPER=en_AU.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] fgsea_1.32.4                                       
##  [2] org.Mm.eg.db_3.20.0                                
##  [3] rtracklayer_1.66.0                                 
##  [4] mitch_1.18.4                                       
##  [5] msigdbr_25.1.1                                     
##  [6] pheatmap_1.0.13                                    
##  [7] plotly_4.11.0                                      
##  [8] e1071_1.7-16                                       
##  [9] RColorBrewer_1.1-3                                 
## [10] magrittr_2.0.3                                     
## [11] lubridate_1.9.4                                    
## [12] forcats_1.0.1                                      
## [13] stringr_1.5.1                                      
## [14] dplyr_1.1.4                                        
## [15] readr_2.1.5                                        
## [16] tidyr_1.3.1                                        
## [17] tibble_3.2.1                                       
## [18] tidyverse_2.0.0                                    
## [19] readxl_1.4.5                                       
## [20] AnnotationDbi_1.68.0                               
## [21] purrr_1.1.0                                        
## [22] metafor_4.8-0                                      
## [23] numDeriv_2016.8-1.1                                
## [24] metadat_1.4-0                                      
## [25] Matrix_1.7-4                                       
## [26] png_0.1-8                                          
## [27] gridExtra_2.3                                      
## [28] missMethyl_1.40.3                                  
## [29] IlluminaHumanMethylationEPICv2anno.20a1.hg38_1.0.0 
## [30] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [31] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1 
## [32] minfi_1.52.1                                       
## [33] bumphunter_1.48.0                                  
## [34] locfit_1.5-9.12                                    
## [35] iterators_1.0.14                                   
## [36] foreach_1.5.2                                      
## [37] Biostrings_2.74.1                                  
## [38] XVector_0.46.0                                     
## [39] beeswarm_0.4.0                                     
## [40] kableExtra_1.4.0                                   
## [41] tictoc_1.2.1                                       
## [42] HGNChelper_0.8.15                                  
## [43] DESeq2_1.46.0                                      
## [44] SummarizedExperiment_1.36.0                        
## [45] Biobase_2.66.0                                     
## [46] MatrixGenerics_1.18.1                              
## [47] matrixStats_1.5.0                                  
## [48] GenomicRanges_1.58.0                               
## [49] GenomeInfoDb_1.42.3                                
## [50] IRanges_2.40.1                                     
## [51] S4Vectors_0.44.0                                   
## [52] BiocGenerics_0.52.0                                
## [53] eulerr_7.0.4                                       
## [54] apeglm_1.28.0                                      
## [55] tximport_1.34.0                                    
## [56] annotables_0.2.0                                   
## [57] biomaRt_2.62.1                                     
## [58] limma_3.62.2                                       
## [59] EnhancedVolcano_1.24.0                             
## [60] ggrepel_0.9.6                                      
## [61] ggplot2_4.0.0                                      
## 
## loaded via a namespace (and not attached):
##   [1] bitops_1.0-9              httr_1.4.7               
##   [3] tools_4.4.3               doRNG_1.8.6.2            
##   [5] utf8_1.2.4                R6_2.6.1                 
##   [7] HDF5Array_1.34.0          lazyeval_0.2.2           
##   [9] rhdf5filters_1.18.1       withr_3.0.2              
##  [11] prettyunits_1.2.0         GGally_2.4.0             
##  [13] base64_2.0.2              preprocessCore_1.68.0    
##  [15] cli_3.6.5                 textshaping_1.0.4        
##  [17] labeling_0.4.3            sass_0.4.10              
##  [19] SQUAREM_2021.1            mvtnorm_1.3-3            
##  [21] S7_0.2.0                  genefilter_1.88.0        
##  [23] proxy_0.4-27              askpass_1.2.1            
##  [25] mixsqp_0.3-54             Rsamtools_2.22.0         
##  [27] systemfonts_1.3.1         siggenes_1.80.0          
##  [29] illuminaio_0.48.0         svglite_2.2.2            
##  [31] rentrez_1.2.4             dichromat_2.0-0.1        
##  [33] scrime_1.3.5              invgamma_1.2             
##  [35] bbmle_1.0.25.1            rstudioapi_0.17.1        
##  [37] RSQLite_2.4.3             generics_0.1.3           
##  [39] BiocIO_1.16.0             vroom_1.6.5              
##  [41] gtools_3.9.5              abind_1.4-8              
##  [43] lifecycle_1.0.4           yaml_2.3.10              
##  [45] mathjaxr_1.8-0            gplots_3.2.0             
##  [47] rhdf5_2.50.2              SparseArray_1.6.2        
##  [49] BiocFileCache_2.14.0      grid_4.4.3               
##  [51] blob_1.2.4                promises_1.4.0           
##  [53] crayon_1.5.3              bdsmatrix_1.3-7          
##  [55] lattice_0.22-5            cowplot_1.2.0            
##  [57] echarts4r_0.4.6           GenomicFeatures_1.58.0   
##  [59] annotate_1.84.0           KEGGREST_1.46.0          
##  [61] pillar_1.11.1             knitr_1.50               
##  [63] beanplot_1.3.1            rjson_0.2.23             
##  [65] codetools_0.2-19          fastmatch_1.1-6          
##  [67] glue_1.8.0                data.table_1.17.8        
##  [69] vctrs_0.6.5               cellranger_1.1.0         
##  [71] gtable_0.3.6              assertthat_0.2.1         
##  [73] emdbook_1.3.14            cachem_1.1.0             
##  [75] xfun_0.54                 mime_0.13                
##  [77] S4Arrays_1.6.0            coda_0.19-4.1            
##  [79] survival_3.8-3            statmod_1.5.0            
##  [81] nlme_3.1-168              bit64_4.6.0-1            
##  [83] progress_1.2.3            filelock_1.0.3           
##  [85] bslib_0.9.0               nor1mix_1.3-3            
##  [87] irlba_2.3.5.1             KernSmooth_2.23-26       
##  [89] otel_0.2.0                splitstackshape_1.4.8    
##  [91] DBI_1.2.3                 tidyselect_1.2.1         
##  [93] bit_4.6.0                 compiler_4.4.3           
##  [95] curl_7.0.0                httr2_1.2.1              
##  [97] xml2_1.4.1                DelayedArray_0.32.0      
##  [99] caTools_1.18.3            scales_1.4.0             
## [101] quadprog_1.5-8            rappdirs_0.3.3           
## [103] digest_0.6.37             rmarkdown_2.30           
## [105] GEOquery_2.74.0           htmltools_0.5.8.1        
## [107] pkgconfig_2.0.3           sparseMatrixStats_1.18.0 
## [109] dbplyr_2.5.1              fastmap_1.2.0            
## [111] rlang_1.1.6               htmlwidgets_1.6.4        
## [113] UCSC.utils_1.2.0          shiny_1.11.1             
## [115] DelayedMatrixStats_1.28.1 farver_2.1.2             
## [117] jquerylib_0.1.4           jsonlite_2.0.0           
## [119] BiocParallel_1.40.2       mclust_6.1.1             
## [121] RCurl_1.98-1.17           GenomeInfoDbData_1.2.13  
## [123] Rhdf5lib_1.28.0           Rcpp_1.1.0               
## [125] babelgene_22.9            stringi_1.8.7            
## [127] zlibbioc_1.52.0           MASS_7.3-65              
## [129] plyr_1.8.9                org.Hs.eg.db_3.20.0      
## [131] ggstats_0.11.0            splines_4.4.3            
## [133] multtest_2.62.0           hms_1.1.3                
## [135] rngtools_1.5.2            reshape2_1.4.4           
## [137] XML_3.99-0.19             evaluate_1.0.5           
## [139] tzdb_0.4.0                httpuv_1.6.16            
## [141] openssl_2.3.4             reshape_0.8.10           
## [143] ashr_2.2-67               xtable_1.8-4             
## [145] restfulr_0.0.16           later_1.4.4              
## [147] viridisLite_0.4.2         class_7.3-23             
## [149] truncnorm_1.0-9           memoise_2.0.1            
## [151] GenomicAlignments_1.42.0  timechange_0.3.0